**************************************************************************************************
***                                 Matched Inventor sample                                    ***                                      
***             The patent information is obtained from  Harvard Patent Inventor database      ***
*** Inventor is assumed to move on the application date of the earlier of the two patents    ***
***             LinkedIn data is compiled from the online career network, LinkedIn             ***
***             Commands for results on the survey sample are not in this do file              ***
**************************************************************************************************

* Load the matched patent-LinkedIn inventor data set.
use pat_lkn.dta, clear

** Derived variables **
* pataccu: 1 when echgyr and echg360yr agree, 0 when they differ;
* left missing unless both indicators are observed.
generate pataccu = (echgyr == echg360yr) if !mi(echgyr, echg360yr)

* Plain natural logs (no offset); a value of zero or less yields missing.
foreach v in patfirstno patfirstyr patlastyr {
	generate `v'_ln = ln(`v')
}

* tenure and patyr are shifted by one before taking logs.
generate tenure_ln      = ln(tenure + 1)
generate patyr_ln       = ln(patyr + 1)
generate pat_breadth_ln = ln(pat_breadth)
generate coinventor_ln  = ln(co_inventor)
* Patent career length: years from first to last patent, inclusive.
generate patcareer_ln   = ln(patlastyr - patfirstyr + 1)
generate patlife_ln     = ln(patlife)

* Variable labels (used by outreg2 through its label option).
lab var pataccu        "Accuracy"
lab var patfirstno_ln  "First patent number (ln)"
lab var patfirstyr_ln  "Year of first patent (ln)"
lab var patlastyr_ln   "Year of last patent (ln)"
lab var tenure_ln      "Tenure (ln)"
lab var patyr_ln       "Patent rate (ln)"
lab var pat_breadth_ln "Patent breadth (ln)"
lab var coinventor_ln  "Co-inventors (ln)"
lab var patcareer_ln   "Patent career (ln)"
lab var patlife_ln     "Lifetime patents (ln)"

****************
*** Analysis ***
****************

******* Table 9 *****
* Specification macros for the Table 9 probits.
*   xlist  : inventor-level regressors, no 5-year dummies
*   xlist1 : patent-level regressors plus the 5-year dummies yr81_85 - yr01_05
*   sample : rows with all listed variables observed
*   option : estimation options (standard errors clustered on lower_id)
*   order  : row order of the exported table
*   report : outreg2 formatting options shared by every column
global xlist patfirstyr_ln patcareer_ln patlife_ln self ///
	invtrq2-invtrq5 tech1-tech6
global xlist1 patfirstyr_ln patyr_ln pat_breadth_ln cites_avg coinventor_ln ///
	cplxty sv tenure_ln tech1 - tech6 yr81_85 - yr01_05
global sample !mi(cites_avg, cplxty, patfirstno_ln, patyr_ln, coinventor_ln)
global option  cluster (lower_id)
global order patfirstyr_ln patcareer_ln patlife_ln self invtrq2-invtrq5 ///
	tenure_ln patyr_ln cites_avg pat_breadth_ln cplxty coinventor_ln sv ///
	invtrq* tech* yr*
global report label addstat(Log Likelihood, e(ll), Inventors, e(N_clust)) ///
	dec(3) sortvar($order) drop(yr*) nocons word

// Table 6(a): agreement between the two mobility indicators
probit pataccu $xlist if $sample , $option
outreg2 using "Table9", $report addtext(5-year f.e., No) replace

// Table 6(b) and 6(c): fp on rows with echgyr == 0, then fn on rows with
// echgyr == 1, both with the 5-year dummies
local chg = 0
foreach dv in fp fn {
	probit `dv' $xlist1 if echgyr == `chg' & $sample , $option
	outreg2 using "Table9", $report addtext(5-year f.e., Yes) append
	local ++chg
}

// Table 6(d) and 6(e): same pair of outcomes on the inventor-level regressors;
// the predicted probabilities are stored as fp_prd and fn_prd
local chg = 0
foreach dv in fp fn {
	probit `dv' $xlist if echgyr == `chg' & $sample , $option
	predict `dv'_prd
	outreg2 using "Table9", $report addtext(5-year f.e., No) append
	local ++chg
}


********Table 7 *******
**table 7 (a)-(d) ****
* Specification macros shared by the Table 7 columns.
global ctrvar patfirstyr_ln 
global explvar tenure_ln patyr_ln cites_avg pat_breadth_ln cplxty coinventor_ln sv   
global tech tech1-tech6 
global year  yr81_85 - yr01_05  
global options cluster(lower_id)
global sample (!mi(echgyr) & !mi(echg360yr)) & !mi(patfirstno_ln) 
global report label addstat(Log Likelihood, e(ll), Inventors, e(N_clust)) ///
	dec(3) addtext(5-year f.e., Yes)  drop(yr*) nocons word 
	

** Table 7(a): based on all inventors with at least 2 patents; the Harvard
** patent dataset is needed to replicate our result 
/*
probit echg360yr $ctrvar $explvar $tech $year  if !mi(patfirstno_ln), $options
outreg2 using "Table10", $report ctitle(Patent:probit) replace
*/

* flag_all tags one row per inventor in the Table 7(a) estimation sample.
* With the Table 7(a) export commented out above, e(sample) would otherwise
* still refer to the last model estimated before this point, so the 7(a)
* specification is re-estimated quietly here (nothing is exported) to make
* e(sample) refer to the intended model on whatever data are loaded.
* NOTE: without the full Harvard data this is still only the matched sample.
quietly probit echg360yr $ctrvar $explvar $tech $year if !mi(patfirstno_ln), $options
gen smpl_all = 1 if e(sample)
* Sorting on smpl_all within inventor puts a sample row (1) ahead of missing.
bysort lower (smpl_all): gen flag_all = 1 if _n == 1 & smpl_all == 1 

//Table 7(b)
probit echg360yr $explvar $tech $ctrvar $year if $sample, $options
outreg2 using "Table10", $report ctitle(Patent:probit) replace
	
* flag1 tags one row per inventor in the Table 7(b) estimation sample.
gen smpl_match = 1 if e(sample) 
bysort lower (smpl_match): gen flag1 = 1 if _n == 1 & smpl_match == 1

//Table 7(c)	
probit echgyr $ctrvar $explvar $tech $year  if $sample, $options
outreg2 using "Table10", $report ctitle(LinkedIn:probit) append 

//Table 7(d) :  sometimes you might need to press "ctrl+break" to change the numeric method 
**the estimated false positive and false negative results are copied directly from the 
**regression output; these statistics cannot be obtained directly 

mrprobit echg360yr $ctrvar $explvar $tech $year if $sample 
outreg2 using "Table10", ctitle(Patent:mrprobit) addtext(5-year f.e., Yes) ///
label addstat(Log Likelihood, e(ll)) dec(3) drop(yr*) nocons word append   

**Table 7(e) & (f) require all patent inventors 
/*
heckprobit echgyr $ctrvar $explvar $tech $year if  !mi(echg360yr), ///
select(LinkedIn_Flag = patfirstno_ln  $ctrvar $explvar $tech $year ) $options 
outreg2 using "Table10", $report append 
*/ 

*** Table 8 *****
* Declare lower_id as the panel identifier for the xt commands below.
xtset lower_id

* Random-effects probit with standard errors clustered on lower_id.
* This replaces the contents of the global macro "options".
global options re vce (cluster lower_id)

//Table 8(a): requires all patent inventors
/*
xtprobit echg360yr $explvar $tech $year if !mi(patfirstno_ln), $options 
outreg2 using "Table11", $report ctitle(Patent:xtprobit) replace
*/

//Table 8(b): outcome echg360yr; starts a fresh Table11 file
xtprobit echg360yr $explvar $tech $year ///
	if $sample, $options
outreg2 using "Table11", $report ///
	ctitle(Patent:xtprobit) replace

//Table 8(c): outcome echgyr; appended as the next column
xtprobit echgyr $explvar $tech $year ///
	if $sample, $options
outreg2 using "Table11", $report ///
	ctitle(LinkedIn:xtprobit) append
	

//below ----- Tables and Figures------------------------------------------
*------------------------------------------------------------------------*
** Summary statistics

// Rows with every variable listed below observed
global cond_summ !mi(patlife_ln, cplxty, cites_avg, pat_breadth_ln, ///
	echg360yr, patyr_ln, coinventor_ln, tech, patfirstno_ln)
generate smpl = 1 if $cond_summ

// Pick one representative row per inventor: within lower, rows are ordered by
// descending LinkedIn_Flag, tech, invtr_pct and lkncareer, then ascending smpl;
// first_obs is 1 on the first row of each inventor and 0 elsewhere.
gsort lower -LinkedIn_Flag -tech -invtr_pct -lkncareer smpl
by lower: generate first_obs = (_n == 1)

//Requires all patent inventors 
/* Table 1 - column (a)
unique lower if $cond_summ
su echg360yr if $cond_summ
su invtr_pct patfirstyr patcareer tech1 - tech6 if first_obs == 1 & $cond_summ

//Table 5(a)
su tenure patyr cites_avg pat_breadth cplxty co_inventor  if $cond_summ 
su sv if first_obs == 1 & $cond_summ  */ 

//Table 1 - column (b): all rows in the loaded data
unique lower
summarize echg360yr echgyr fp fn
summarize invtr_pct patfirstyr patcareer lkncareer tech1 - tech6 ///
	if first_obs == 1

//Table 5(b)
summarize tenure patyr cites_avg pat_breadth cplxty co_inventor
summarize sv if first_obs == 1

//Table 1 - column (c): rows satisfying $cond_summ
unique lower if $cond_summ
summarize echg360yr echgyr fp fn if $cond_summ
summarize invtr_pct patfirstyr patcareer lkncareer tech1 - tech6 ///
	if first_obs == 1 & $cond_summ

//Table 5(c)
summarize tenure patyr cites_avg pat_breadth cplxty co_inventor if $cond_summ
summarize sv if first_obs == 1 & $cond_summ

**Table 1(d) is obtained from a different data set  

*************************
*******Figures **********
*************************
// Yearly counts of non-missing mobility indicators among rows with L == 1.
// NOTE(review): L is presumably a variable abbreviation (LinkedIn_Flag?) - confirm.
bysort year: egen num_of_lkn_obs = count(echgyr)    if L == 1
bysort year: egen num_of_pat_obs = count(echg360yr) if L == 1

// Figure 1 - temporal coverage of the data
lab var num_of_pat_obs "Harvard Patent Inventor Database"
lab var num_of_lkn_obs "LinkedIn Profiles"
lab var year "Year"
twoway (scatter num_of_lkn_obs year if inrange(year, 1975, 2003), ///
		msymbol(+) yaxis(1) xscale(range(1975 2003))) ///
	(scatter num_of_pat_obs year if inrange(year, 1975, 2003), ///
		yaxis(1) xscale(range(1975 2003)) xlabel(1975(7)2003) ///
		ytitle("Number of observations") msymbol(Th) mcolor(green))

// Figure 2 inputs: yearly means of the two mobility indicators.
// avg_all_lkn_mobility uses every row; the other series use only rows where
// both echgyr and echg360yr are observed.
bysort year: egen avg_all_lkn_mobility = mean(echgyr)
bysort year: egen avg_lkn_mobility = mean(echgyr)    if !mi(echgyr, echg360yr)
bysort year: egen avg_pat_mobility = mean(echg360yr) if !mi(echgyr, echg360yr)
bysort year: egen std_pat_mobility = sd(echg360yr)   if !mi(echgyr, echg360yr)
// Standard error of the yearly mean and a band of two standard errors around it.
// NOTE(review): num_of_pat_obs is counted on L == 1 rows, while the sd uses rows
// with both indicators observed - confirm the two samples are meant to coincide.
generate m_std_pat_mobility = (std_pat_mobility^2/num_of_pat_obs)^0.5
generate upperbound = avg_pat_mobility + 2 * m_std_pat_mobility
generate lowerbound = avg_pat_mobility - 2 * m_std_pat_mobility

// Figure 2 - average mobility by year
lab var avg_lkn_mobility "LinkedIn profiles (years covered by both patents and LinkedIn profiles)"
lab var avg_pat_mobility "Harvard Patent Inventor Database (years covered by both patents and LinkedIn profiles)"
lab var avg_all_lkn_mobility "LinkedIn profiles (all years)"
twoway (scatter avg_pat_mobility year if inrange(year, 1975, 2003), ///
		msymbol(Th) xlabel(1975(7)2003) ylabel(0 (0.1) 0.3) ///
		ytitle("Average mobility") mcolor(green)) ///
	(scatter avg_all_lkn_mobility year if inrange(year, 1975, 2003), ///
		msymbol(+) mcolor(edkblue)) ///
	(scatter avg_lkn_mobility year if inrange(year, 1975, 2003), ///
		msymbol(Oh) mcolor(cranberry))

// Figure 4 - density of first patent year, full sample vs matched sample
// you will fail to reproduce this figure as flag_all is defined based on Table10(a)
kdensity patfirstyr if flag_all == 1 & patfirstyr>=1980, ///
	bwidth(1) lpattern(dash) ///
	addplot(kdensity patfirstyr if flag1 == 1 & patfirstyr>=1980, bwidth(1)) ///
	legend(label(1 "Inventors with at least 2 patents") ///
		label(2 "Matched inventor-year sample")) ///
	xtitle("First patent year") title("")


*********************
**** supplement *****
*********************

**S2 requires survey data; the commands are in a separate set of do-files 

****************Table S3 ******************************
**requires all patent inventors , omitted here 

**************** Table S4 *****************************
* Specification macros for the supplement (set again here so that this
* section does not depend on the values left by earlier sections).
global ctrvar  patfirstyr_ln
global explvar tenure_ln patyr_ln cites_avg pat_breadth_ln cplxty coinventor_ln sv
global tech    tech1-tech6
global year    yr81_85 - yr01_05
global options cluster(lower_id)
global sample  (!mi(echgyr) & !mi(echg360yr)) & !mi(patfirstno_ln)
global report  label addstat(Log Likelihood, e(ll), Inventors, e(N_clust)) ///
	dec(3) addtext(5-year f.e., Yes) drop(yr*) nocons word

// Table S4(a): probit on echg360yr
probit echg360yr $explvar $tech $ctrvar $year ///
	if $sample, $options
outreg2 using "Supplement", $report ctitle(Patent:probit) replace

// Table S4(b): probit on echgyr
probit echgyr $ctrvar $explvar $tech $year ///
	if $sample, $options
outreg2 using "Supplement", $report ctitle(LinkedIn:probit) append

*** exact fp & fn ***
// Table S4(c)
// fp and fn are overwritten in place: 0.9998 on rows where the two indicators
// disagree in the stated direction, 0.0001 on every other row.
replace fp = cond(echgyr == 0 & echg360yr == 1, 0.9998, 0.0001)
replace fn = cond(echgyr == 1 & echg360yr == 0, 0.9998, 0.0001)
mrprobit echg360yr $explvar $tech $ctrvar $year if $sample, ///
	alpha0(fp) alpha1(fn)
outreg2 using "Supplement", ctitle(Patent:mrprobit) addtext(5-year f.e., Yes) ///
	label addstat(Log Likelihood, e(ll)) dec(3) drop(yr*) nocons word append

// Table S4(d): probit restricted to rows where both indicators agree (both 0 or both 1)
generate true_flag = 1 if echgyr == echg360yr & inlist(echgyr, 0, 1)
probit echg360yr $explvar $tech $year $ctrvar ///
	if $sample & true_flag == 1, $options
outreg2 using "Supplement", $report ctitle(Patent:probit) append

// Table S4(e): mrprobit with the rates set to the predicted probabilities
// fp_prd and fn_prd (which must already exist in the data)
generate fppr = fp_prd
generate fnpr = fn_prd
mrprobit echg360yr $explvar $tech $ctrvar $year if $sample, ///
	alpha0(fppr) alpha1(fnpr)
outreg2 using "Supplement", ctitle(Patent:mrprobit PPE) addtext(5-year f.e., Yes) ///
	label addstat(Log Likelihood, e(ll)) dec(3) drop(yr*) nocons word append


*** Tables S5-S7 use data constructed with a different method. The commands are exactly 
*** the same as for Tables 5-7 
